In [72]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
In [73]:
titanic_df = pd.read_csv('titanic.csv')
In [74]:
# Randomly shuffle the rows; this will matter later when we split off a test set
titanic_df = titanic_df.sample(frac = 1)
In [75]:
titanic_df.reset_index(drop=True, inplace=True)
In [76]:
titanic_df
Out[76]:
In [77]:
#Select a subset of the columns
titanic_df = titanic_df[['pclass', 'survived', 'sex', 'age', 'fare']]
In [78]:
#Fill the NaN with -1
titanic_df.fillna(-1, inplace = True)
In [79]:
#A Python dictionary mapping sex to a number; -1 (missing) maps to itself
sex_dict = {'male': 0, 'female': 1, -1: -1}
In [80]:
#Our first look at the magic of "apply"
titanic_df.sex = titanic_df.sex.apply(lambda x: sex_dict[x])
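Since this is a straight dictionary lookup, pandas' Series.map would do the same job as apply here; a minimal equivalent sketch:

#Equivalent: map looks each value up in the dictionary directly
titanic_df.sex = titanic_df.sex.map(sex_dict)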
In [81]:
#How bad is our missing data situation?
for field in titanic_df.columns:
    print field, sum(titanic_df[field] == -1)
In [82]:
#Is the fact that age is not reported relevant?
no_age_df = titanic_df[titanic_df.age == -1]
yes_age_df = titanic_df[titanic_df.age != -1]
In [83]:
for field in ['fare', 'sex', 'survived', 'pclass']:
    print field
    print 'Missing Age: ', no_age_df[field].mean()
    print 'Present Age: ', yes_age_df[field].mean()
In [84]:
#Decide what to do with the missing values: drop every row containing a -1
titanic_df = titanic_df[(titanic_df.age != -1)&(titanic_df.survived != -1)&(titanic_df.sex != -1)&(titanic_df.fare != -1)&(titanic_df.pclass != -1)]
#More Elegant
#titanic_df = titanic_df[(titanic_df.T != -1).all()]
In [85]:
titanic_df
Out[85]:
In [86]:
#Again reset the index
titanic_df.reset_index(drop=True, inplace = True)
In [87]:
#Set up our correlation matrix
correlation_matrix = np.zeros(shape=(5,5))
In [88]:
correlation_matrix
Out[88]:
In [89]:
#Populate it
for i, field1 in enumerate(titanic_df.columns):
    for j, field2 in enumerate(titanic_df.columns):
        correlation_matrix[i,j] = pearsonr(titanic_df[field1], titanic_df[field2])[0]
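For pairwise Pearson correlations, pandas has this built in; a one-line equivalent of the loop above (all our columns are numeric at this point):

#Built-in equivalent: DataFrame.corr defaults to Pearson
titanic_df.corr()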
In [90]:
titanic_df.columns
Out[90]:
In [91]:
correlation_matrix
Out[91]:
In [92]:
#Set figure size
plt.figure(figsize=(10,8))
#Specify we would like a heatmap
plt.imshow(correlation_matrix, interpolation = 'nearest', cmap = 'Greys')
#Specify the x and y labels
plt.xticks(range(5), titanic_df.columns, rotation = 90, fontsize = 16)
plt.yticks(range(5), titanic_df.columns, fontsize = 16)
Out[92]:
In [93]:
#for each column, draw a histogram of the distribution
for field in titanic_df.columns:
    plt.clf()
    plt.hist(titanic_df[field], color = np.random.rand(3))
    plt.title(field)
    plt.show()
In [94]:
#Further subset the dataframe
titanic_df = titanic_df[['sex', 'age', 'fare', 'survived']]
In [95]:
titanic_df
Out[95]:
In [96]:
#Normalize age
titanic_df['n_age'] = titanic_df.age.apply(lambda x: (x-titanic_df.age.mean())/titanic_df.age.std())
In [97]:
#Take the log of fare
titanic_df['logfare'] = titanic_df.fare.apply(lambda x: np.log(x))
In [98]:
#Draw the histogram of logfare
plt.hist(titanic_df[np.isfinite(titanic_df.logfare)].logfare, color = np.random.rand(3))
Out[98]:
In [99]:
#a log transformation sends 0 --> -infinity, so drop those rows
titanic_df = titanic_df[np.isfinite(titanic_df.logfare)]
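An alternative worth knowing: np.log1p computes log(1 + x), so a fare of 0 maps to 0 instead of -infinity and no rows need dropping. The column name below is made up for illustration, and the transformed values differ slightly from a plain log, so it is not a drop-in replacement for the cells above:

#log1p sidesteps the 0 --> -infinity problem entirely
titanic_df['logfare1p'] = np.log1p(titanic_df.fare)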
In [100]:
#Now normalize the log of fare
titanic_df['n_logfare'] = titanic_df.logfare.apply(lambda x: (x-titanic_df.logfare.mean())/titanic_df.logfare.std())
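The two apply-based z-scores above call mean() and std() once per element; pandas arithmetic broadcasts over whole columns, so the same values can be computed in one vectorized expression each. A sketch of the equivalents:

#Vectorized equivalents of the apply-based z-scores
titanic_df['n_age'] = (titanic_df.age - titanic_df.age.mean())/titanic_df.age.std()
titanic_df['n_logfare'] = (titanic_df.logfare - titanic_df.logfare.mean())/titanic_df.logfare.std()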
In [101]:
titanic_df
Out[101]:
In [102]:
#Create the dataframe we will use for machine learning
sim_df = titanic_df[['survived', 'sex', 'n_age', 'n_logfare']]
In [103]:
#Randomly sample 600 people from the dataset (drop=True keeps the old
#index from becoming an extra column)
lim_sim_df = sim_df.sample(600).reset_index(drop=True)
In [104]:
#initialize our similarity matrix
sim_mtx = np.zeros(shape=(len(lim_sim_df), len(lim_sim_df)))
In [105]:
#Get a list of who survived and who didn't from our 600
surv_list = lim_sim_df.survived
In [106]:
%%time
#populate the similarity matrix from the feature columns only; including
#'survived' itself would leak the label into the similarity
feats = lim_sim_df[['sex', 'n_age', 'n_logfare']]
for i in range(len(sim_mtx)):
    if i%100 == 0:
        print i
    v1 = feats.iloc[i]
    for j in range(i, len(sim_mtx)):
        norm = np.exp(-np.linalg.norm(v1 - feats.iloc[j]))
        sim_mtx[i,j] = norm
        sim_mtx[j,i] = norm
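The double loop is quadratic in Python; scipy.spatial.distance.cdist computes all pairwise Euclidean distances in one vectorized call. A sketch that should reproduce the same matrix up to floating-point noise:

from scipy.spatial.distance import cdist

#All pairwise distances at once, then the same exponential kernel
X = lim_sim_df[['sex', 'n_age', 'n_logfare']].values
sim_mtx = np.exp(-cdist(X, X))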
In [107]:
#our test set will be 15%
pred_size = int(0.15*len(sim_mtx))
print pred_size
In [108]:
#make our predictions by majority vote of the 149 most similar neighbors
pred_manual = []
for i in range(pred_size):
    #rank the training rows (indices pred_size and up) by similarity to row i
    indices_to_use = sorted(range(pred_size, len(sim_mtx)), key=lambda j: sim_mtx[i][j])
    indices_to_use = indices_to_use[-149:]
    sim_list = surv_list[indices_to_use].tolist()
    #the most common outcome among the neighbors is the prediction
    mode = max(set(sim_list), key=sim_list.count)
    pred_manual.append(mode)
In [109]:
sum(((pred_manual - surv_list[:pred_size]) == 0))/float(pred_size)
Out[109]:
In [110]:
#initialize the KNN with the same k (149) we used manually
neigh = KNeighborsClassifier(n_neighbors=149)
In [111]:
pred_size = int(0.15*len(titanic_df))
auto_surv_list = titanic_df.survived
print pred_size
In [112]:
for feat_list in [['sex'], ['age'], ['fare'], ['sex', 'age'], ['sex', 'fare'], ['age', 'fare'], ['sex', 'fare', 'age']]:
    #fit the model with the training data
    neigh.fit(titanic_df[feat_list][pred_size:].values, titanic_df['survived'][pred_size:])
    pred_auto = neigh.predict(titanic_df[feat_list][:pred_size].values)
    print feat_list
    print sum(((pred_auto - auto_surv_list[:pred_size]) == 0))/float(pred_size)
In [113]:
#Graph accuracy vs k for our manual KNN
k_list = []
pred_size = int(0.15*len(sim_mtx))
for k in range(1, 200):
    pred_manual = []
    for i in range(pred_size):
        sim_list = surv_list[sorted(range(pred_size, len(sim_mtx)), key=lambda j: sim_mtx[i][j])[-k:]].tolist()
        pred_manual.append(max(set(sim_list), key=sim_list.count))
    acc = sum(((pred_manual - surv_list[:pred_size]) == 0))/float(pred_size)
    k_list.append(acc)
plt.figure(figsize=(10,8))
plt.plot(range(1,200), k_list)
Out[113]:
In [114]:
#Graph accuracy vs k for SKL KNN
k_list_auto = []
pred_size = int(0.15*len(titanic_df))
feat_list = ['sex', 'age', 'fare']
for k in range(1,800):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(titanic_df[feat_list][pred_size:].values, titanic_df['survived'][pred_size:])
    pred_auto = neigh.predict(titanic_df[feat_list][:pred_size].values)
    acc = sum(((pred_auto - auto_surv_list[:pred_size]) == 0))/float(pred_size)
    k_list_auto.append(acc)
plt.figure(figsize=(10,8))
plt.plot(range(1,800), k_list_auto, color = 'r')
Out[114]:
In [115]:
#Side by side
plt.figure(figsize=(10,8))
plt.plot(range(1,800), k_list_auto, color = 'red')
plt.plot(range(1,200), k_list, color = 'blue')
#0.62 is roughly the accuracy of always guessing the majority class (did not survive)
plt.axhline(0.62, color = 'k', linewidth = 1.5)
Out[115]:
In [116]:
#define precision and recall function
def precision_recall(pred, true):
    pred = np.asarray(pred)
    true = np.asarray(true)
    #pred+true == 2 marks true positives; pred-true == 1 marks false positives
    if (sum(pred+true == 2) + sum(pred-true == 1)) != 0:
        precision = float(sum(pred+true == 2))/(sum(pred+true == 2) + sum(pred-true == 1))
    else:
        precision = 0
    #pred-true == -1 marks false negatives
    if (sum(pred+true == 2) + sum(pred-true == -1)) != 0:
        recall = float(sum(pred+true == 2))/(sum(pred+true == 2) + sum(pred-true == -1))
    else:
        recall = 0
    return (precision, recall)
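A quick sanity check on made-up arrays: pred = [1, 1, 0, 0] against true = [1, 0, 1, 0] gives one true positive, one false positive, and one false negative, so both metrics should come out to 0.5. sklearn's precision_score and recall_score compute the same quantities if you would rather not hand-roll them:

#Toy check: 1 TP, 1 FP, 1 FN --> precision = recall = 0.5
print precision_recall([1, 1, 0, 0], [1, 0, 1, 0])

#Cross-check against sklearn (note the argument order: true first)
from sklearn.metrics import precision_score, recall_score
print precision_score([1, 0, 1, 0], [1, 1, 0, 0]), recall_score([1, 0, 1, 0], [1, 1, 0, 0])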
In [117]:
#Graph precision and recall vs k for SKL KNN
k_list_auto = []
pred_size = int(0.15*len(titanic_df))
feat_list = ['sex', 'age', 'fare']
for k in range(1,550):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(titanic_df[feat_list][pred_size:].values, titanic_df['survived'][pred_size:])
    pred_auto = neigh.predict(titanic_df[feat_list][:pred_size].values)
    p_r = precision_recall(pred_auto, auto_surv_list[:pred_size])
    k_list_auto.append(p_r)
plt.figure(figsize=(10,8))
plt.plot(range(1,550), [i[0] for i in k_list_auto], color = 'r')
plt.plot(range(1,550), [i[1] for i in k_list_auto], color = 'g')
plt.axhline(0.32, color = 'red', linewidth=2, alpha = 0.5)
Out[117]:
In [118]:
#A magical loop
pred_size = int(0.15*len(titanic_df))
feat_list = ['sex', 'age', 'fare']
clfs = {
    'RF': RandomForestClassifier(),
    'LR': LogisticRegression(),
    'GB': GradientBoostingClassifier(),
    'ET': ExtraTreesClassifier(),
    'KNN': KNeighborsClassifier(n_neighbors=300),
    'AB': AdaBoostClassifier()
}
for clf_name in clfs.keys():
    print clf_name
    clf = clfs[clf_name]
    clf.fit(titanic_df[feat_list][pred_size:].values, titanic_df['survived'][pred_size:])
    pred_auto = clf.predict(titanic_df[feat_list][:pred_size].values)
    acc = sum(((pred_auto - auto_surv_list[:pred_size]) == 0))/float(pred_size)
    print 'Accuracy: ', acc
    p_r = precision_recall(pred_auto, auto_surv_list[:pred_size])
    print 'Precision: ', p_r[0]
    print 'Recall: ', p_r[1]
    print '----------------------------------------------'
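A single 15% split gives a noisy estimate of performance; cross-validation averages over several train/test splits. A minimal sketch, assuming sklearn 0.18+ for the model_selection module:

#5-fold cross-validated accuracy for one of the classifiers
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LogisticRegression(), titanic_df[feat_list].values,
                         titanic_df['survived'].values, cv=5)
print scores.mean(), scores.std()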
In [119]:
#WRONG WRONG WRONG!!!!!! (fitting and evaluating on the same data)
pred_size = int(0.15*len(titanic_df))
feat_list = ['sex', 'age', 'fare']
clfs = {
    'RF': RandomForestClassifier(),
    'LR': LogisticRegression(),
    'GB': GradientBoostingClassifier(),
    'ET': ExtraTreesClassifier(),
    'KNN': KNeighborsClassifier(),
    'AB': AdaBoostClassifier()
}
for clf_name in clfs.keys():
    print clf_name + ' - WRONG!'
    clf = clfs[clf_name]
    clf.fit(titanic_df[feat_list].values, titanic_df['survived'])
    pred_auto = clf.predict(titanic_df[feat_list].values)
    acc = sum(((pred_auto - auto_surv_list) == 0))/float(len(titanic_df))
    print 'Accuracy: ', acc
    p_r = precision_recall(pred_auto, auto_surv_list)
    print 'Precision: ', p_r[0]
    print 'Recall: ', p_r[1]
    print '----------------------------------------------'
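The fix for the cell above is to hold out data the model never sees during fit. The slice-based split used earlier already does this (the rows were shuffled up front), but sklearn's train_test_split makes the intent explicit; a minimal sketch, again assuming sklearn 0.18+:

#RIGHT: fit on the training split, score on the held-out split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    titanic_df[feat_list].values, titanic_df['survived'].values, test_size=0.15)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print clf.score(X_test, y_test)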
In [120]:
#Logistic regression coefficients on the raw (unnormalized) features
clf = LogisticRegression()
clf.fit(titanic_df[['sex', 'age', 'fare']][pred_size:].values, titanic_df['survived'][pred_size:])
plt.figure(figsize=(10,8))
plt.bar([1,2,3], clf.coef_[0], tick_label = ['sex', 'age', 'fare'], align = 'center')
Out[120]:
In [121]:
#Logistic regression coefficients on the normalized features
clf = LogisticRegression()
clf.fit(titanic_df[['sex', 'n_age', 'n_logfare']][pred_size:].values, titanic_df['survived'][pred_size:])
plt.figure(figsize=(10,8))
plt.bar([1,2,3], clf.coef_[0], tick_label = ['sex', 'n_age', 'n_logfare'], align = 'center')
Out[121]:
In [122]:
#Random forest feature importances on the normalized features
clf = RandomForestClassifier()
clf.fit(titanic_df[['sex', 'n_age', 'n_logfare']][pred_size:].values, titanic_df['survived'][pred_size:])
plt.figure(figsize=(10,8))
plt.bar([1,2,3], clf.feature_importances_, tick_label = ['sex', 'n_age', 'n_logfare'], align = 'center')
Out[122]:
In [ ]: